#Importing required libraries
import pandas as pd
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import datetime, warnings, scipy
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# We have identified the small airlines based on the cutoff <20000 number of flights and the result is 10 airlines
#Frontier, GoJet,Alaska,AirWisconsin,Commute,Hawaiian,Horizon Air,Mesa,Piedmont,Spirit Airlines
#We created a database table with all the data and used a view which contains below columns. For ease of our analysis we created separate csv files for each small airline and loaded them to the dataframe
#Loading the dataframes for each small airline as listed above
frontier=pd.read_csv('/content/drive/MyDrive/Data/F9.csv')
gojet=pd.read_csv('/content/drive/MyDrive/Data/G7.csv')
alaska=pd.read_csv('/content/drive/MyDrive/Data/AS.csv')
airwis=pd.read_csv('/content/drive/MyDrive/Data/ZW.csv')
commute=pd.read_csv('/content/drive/MyDrive/Data/C5.csv')
hawa=pd.read_csv('/content/drive/MyDrive/Data/HA.csv')
pied=pd.read_csv('/content/drive/MyDrive/Data/PT.csv')
spirit=pd.read_csv('/content/drive/MyDrive/Data/NK.csv')
horizon=pd.read_csv('/content/drive/MyDrive/Data/QX.csv')
mesa=pd.read_csv('/content/drive/MyDrive/Data/YV.csv')
#Now that we have the data for each airline we will be merging the dataframes to perform EDA
#Note:- We again break this merged data frame into precovid and postcovid as it is easier to handle data that way due to low computational power of our machines
df=[frontier,gojet,alaska,airwis,commute,hawa,pied,spirit,horizon,mesa]
small_airlines = pd.concat(df, ignore_index=True, sort=False)
small_airlines
| year | month | flightdate | DayOfWeek | DayofMonth | Operating_Airline | OriginCityName | OriginStateName | DestCityName | DestStateName | ... | SecurityDelay | LateAircraftDelay | TaxiIn | TaxiOut | WheelsOff | WheelsOn | Origin | Dest | Tail_Number | Description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 2 | 2022-02-01 | 2 | 1 | F9 | Las Vegas, NV | Nevada | San Diego, CA | California | ... | NaN | NaN | 5.0 | 15.0 | 1807 | 1857 | LAS | SAN | N206FR | Frontier Airlines Inc. |
| 1 | 2022 | 2 | 2022-02-01 | 2 | 1 | F9 | San Diego, CA | California | Las Vegas, NV | Nevada | ... | NaN | NaN | 11.0 | 18.0 | 2009 | 2057 | SAN | LAS | N206FR | Frontier Airlines Inc. |
| 2 | 2022 | 2 | 2022-02-01 | 2 | 1 | F9 | Las Vegas, NV | Nevada | Orlando, FL | Florida | ... | 0.0 | 0.0 | 12.0 | 16.0 | 33 | 707 | LAS | MCO | N206FR | Frontier Airlines Inc. |
| 3 | 2022 | 2 | 2022-02-01 | 2 | 1 | F9 | Denver, CO | Colorado | Fort Myers, FL | Florida | ... | NaN | NaN | 10.0 | 16.0 | 55 | 600 | DEN | RSW | N207FR | Frontier Airlines Inc. |
| 4 | 2022 | 2 | 2022-02-01 | 2 | 1 | F9 | Fort Myers, FL | Florida | Buffalo, NY | New York | ... | NaN | NaN | 7.0 | 24.0 | 742 | 1018 | RSW | BUF | N207FR | Frontier Airlines Inc. |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5975536 | 2022 | 1 | 2022-01-03 | 1 | 3 | YV | Cedar Rapids/Iowa City, IA | Iowa | Dallas/Fort Worth, TX | Texas | ... | 0.0 | 60.0 | 31.0 | 13.0 | 1755 | 1933 | CID | DFW | N957LR | Mesa Airlines Inc. |
| 5975537 | 2022 | 1 | 2022-01-01 | 6 | 1 | YV | Dallas/Fort Worth, TX | Texas | Mobile, AL | Alabama | ... | NaN | NaN | 3.0 | 24.0 | 1802 | 1909 | DFW | MOB | N958LR | Mesa Airlines Inc. |
| 5975538 | 2022 | 1 | 2022-01-02 | 7 | 2 | YV | Dallas/Fort Worth, TX | Texas | Mobile, AL | Alabama | ... | 0.0 | 0.0 | 6.0 | 18.0 | 1903 | 2014 | DFW | MOB | N919FJ | Mesa Airlines Inc. |
| 5975539 | 2022 | 1 | 2022-01-01 | 6 | 1 | YV | Mobile, AL | Alabama | Dallas/Fort Worth, TX | Texas | ... | NaN | NaN | 22.0 | 9.0 | 1958 | 2134 | MOB | DFW | N958LR | Mesa Airlines Inc. |
| 5975540 | 2022 | 1 | 2022-01-02 | 7 | 2 | YV | Mobile, AL | Alabama | Dallas/Fort Worth, TX | Texas | ... | 0.0 | 60.0 | 12.0 | 10.0 | 2133 | 2302 | MOB | DFW | N919FJ | Mesa Airlines Inc. |
5975541 rows × 34 columns
#Identifying unique values for the data
for col in small_airlines.select_dtypes(include='object').columns:
print(col)
print(small_airlines[col].unique())
flightdate ['2022-02-01' '2022-02-02' '2022-02-03' ... '2022-01-29' '2022-01-30' '2022-01-31'] Operating_Airline ['F9' 'G7' 'AS' 'ZW' 'C5' 'HA' 'PT' 'NK' 'QX' 'YV'] OriginCityName ['Las Vegas, NV' 'San Diego, CA' 'Denver, CO' 'Fort Myers, FL' 'Buffalo, NY' 'Orlando, FL' 'Miami, FL' 'Philadelphia, PA' 'Tampa, FL' 'Cleveland, OH' 'Baltimore, MD' 'Phoenix, AZ' 'Trenton, NJ' 'West Palm Beach/Palm Beach, FL' 'Chicago, IL' 'Atlanta, GA' 'Islip, NY' 'Grand Rapids, MI' 'Houston, TX' 'Little Rock, AR' 'Boston, MA' 'Washington, DC' 'Reno, NV' 'Providence, RI' 'Dallas/Fort Worth, TX' 'Minneapolis, MN' 'Madison, WI' 'Ontario, CA' 'Salt Lake City, UT' 'Sacramento, CA' 'Oakland, CA' 'Raleigh/Durham, NC' 'San Juan, PR' 'Santa Ana, CA' 'San Francisco, CA' 'New Orleans, LA' 'Portland, ME' 'Portland, OR' 'Cincinnati, OH' 'Newark, NJ' 'Hartford, CT' 'Charlotte, NC' 'Omaha, NE' 'Detroit, MI' 'Fayetteville, AR' 'Seattle, WA' 'Kansas City, MO' 'St. Louis, MO' 'Pensacola, FL' 'Syracuse, NY' 'Indianapolis, IN' 'Rochester, NY' 'Jacksonville, FL' 'New York, NY' 'Burbank, CA' 'Milwaukee, WI' 'Austin, TX' 'Cedar Rapids/Iowa City, IA' 'Colorado Springs, CO' 'Norfolk, VA' 'Harrisburg, PA' 'Nashville, TN' 'Tulsa, OK' 'Knoxville, TN' 'Newburgh/Poughkeepsie, NY' 'Fresno, CA' 'Albuquerque, NM' 'Sioux Falls, SD' 'El Paso, TX' 'Green Bay, WI' 'Fargo, ND' 'Bloomington/Normal, IL' 'Des Moines, IA' 'Harlingen/San Benito, TX' 'Tucson, AZ' 'San Antonio, TX' 'Huntsville, AL' 'Wilmington, DE' 'Oklahoma City, OK' 'Durango, CO' 'Sarasota/Bradenton, FL' 'Spokane, WA' 'Columbus, OH' 'Memphis, TN' 'Louisville, KY' 'Burlington, VT' 'Albany, NY' 'Pittsburgh, PA' 'Fort Lauderdale, FL' 'Charleston, SC' 'Charlotte Amalie, VI' 'Bismarck/Mandan, ND' 'Myrtle Beach, SC' 'Aguadilla, PR' 'Savannah, GA' 'Grand Junction, CO' 'Branson, MO' 'Kalispell, MT' 'Missoula, MT' 'Bozeman, MT' 'Los Angeles, CA' 'Palm Springs, CA' 'San Jose, CA' 'Billings, MT' 'Wichita, KS' 'Jackson, WY' 'Jackson/Vicksburg, MS' 'Boise, ID' 'Anchorage, AK' 'Greer, SC' 'Mobile, AL' 'Tyler, TX' 'Santa Barbara, CA' 'Portsmouth, NH' 'Lafayette, LA' 'Birmingham, AL' 'Greensboro/High Point, NC' 'Springfield, MO' 'Presque Isle/Houlton, ME' 'Richmond, VA' 'Traverse City, MI' 'Manchester, NH' 'Wilmington, NC' 'Akron, OH' 'Hilton Head, SC' 'Flint, MI' 'Scranton/Wilkes-Barre, PA' 'Asheville, NC' 'State College, PA' 'Ithaca/Cortland, NY' 'Lincoln, NE' 'Nantucket, MA' 'Allentown/Bethlehem/Easton, PA' 'Dayton, OH' 'Roanoke, VA' 'Bangor, ME' 'Peoria, IL' 'Saginaw/Bay City/Midland, MI' 'Moline, IL' 'Lexington, KY' 'Kalamazoo, MI' 'Lansing, MI' 'Appleton, WI' 'Hayden, CO' 'Montrose/Delta, CO' 'Pasco/Kennewick/Richland, WA' 'Bend/Redmond, OR' 'Casper, WY' 'Midland/Odessa, TX' 'Eagle, CO' 'Gunnison, CO' 'Elmira/Corning, NY' 'Rapid City, SD' 'Medford, OR' 'Brownsville, TX' 'South Bend, IN' 'Cody, WY' 'Great Falls, MT' 'Kodiak, AK' 'Bethel, AK' 'Barrow, AK' 'Deadhorse, AK' 'Juneau, AK' 'Ketchikan, AK' 'Yakutat, AK' 'Cordova, AK' 'Sitka, AK' 'Petersburg, AK' 'Wrangell, AK' 'Fairbanks, AK' 'Dallas, TX' 'Nome, AK' 'Kotzebue, AK' 'Honolulu, HI' 'Kahului, HI' 'Lihue, HI' 'Kona, HI' 'Adak Island, AK' 'Everett, WA' 'King Salmon, AK' 'Gustavus, AK' 'Dillingham, AK' 'Santa Rosa, CA' 'Cold Bay, AK' 'Eugene, OR' 'Bellingham, WA' 'White Plains, NY' 'Charleston/Dunbar, WV' 'Springfield, IL' 'Duluth, MN' 'Columbia, SC' 'Erie, PA' 'Charlottesville, VA' 'Chattanooga, TN' 'Panama City, FL' 'Fort Wayne, IN' 'Columbia, MO' 'Mosinee, WI' 'Evansville, IN' 'Valparaiso, FL' 'Waterloo, IA' 'La Crosse, WI' 'Rochester, MN' 'Fayetteville, NC' 'Newport News/Williamsburg, VA' 'Champaign/Urbana, IL' 'Baton Rouge, LA' 'Shreveport, LA' 'Flagstaff, AZ' 'Dickinson, ND' 'Texarkana, AR' 'Gulfport/Biloxi, MS' 'Laredo, TX' 'Corpus Christi, TX' 'Mission/McAllen/Edinburg, TX' 'Lubbock, TX' 'Santa Fe, NM' 'Lake Charles, LA' 'Amarillo, TX' 'Alexandria, LA' 'Idaho Falls, ID' 'Helena, MT' 'Hobbs, NM' 'College Station/Bryan, TX' 'Killeen, TX' 'Monroe, LA' 'Minot, ND' 'Long Beach, CA' 'Hilo, HI' 'Pago Pago, TT' 'Montgomery, AL' 'Greenville, NC' 'Bristol/Johnson City/Kingsport, TN' 'Lynchburg, VA' 'Florence, SC' 'Ashland, WV' 'Augusta, GA' 'Watertown, NY' 'Tallahassee, FL' 'Salisbury, MD' 'Gainesville, FL' 'Jacksonville/Camp Lejeune, NC' 'Toledo, OH' 'Columbus, GA' 'New Bern/Morehead/Beaufort, NC' 'Williamsport, PA' 'Worcester, MA' 'Melbourne, FL' 'Atlantic City, NJ' 'Latrobe, PA' 'Christiansted, VI' 'Niagara Falls, NY' 'Ponce, PR' 'Plattsburgh, NY' 'Yakima, WA' 'Pullman, WA' 'Walla Walla, WA' 'Wenatchee, WA' 'San Luis Obispo, CA' 'Sun Valley/Hailey/Ketchum, ID' 'Redding, CA' 'Monterey, CA' 'Lewiston, ID' 'Mammoth Lakes, CA' 'Key West, FL' 'Yuma, AZ' 'Bakersfield, CA' 'Daytona Beach, FL' 'Fort Smith, AR' 'Lawton/Fort Sill, OK' 'Roswell, NM' 'Joplin, MO' 'Grand Island, NE'] OriginStateName ['Nevada' 'California' 'Colorado' 'Florida' 'New York' 'Pennsylvania' 'Ohio' 'Maryland' 'Arizona' 'New Jersey' 'Illinois' 'Georgia' 'Michigan' 'Texas' 'Arkansas' 'Massachusetts' 'Virginia' 'Rhode Island' 'Minnesota' 'Wisconsin' 'Utah' 'North Carolina' 'Puerto Rico' 'Louisiana' 'Maine' 'Oregon' 'Kentucky' 'Connecticut' 'Nebraska' 'Washington' 'Missouri' 'Indiana' 'Iowa' 'Tennessee' 'Oklahoma' 'New Mexico' 'South Dakota' 'North Dakota' 'Alabama' 'Delaware' 'Vermont' 'South Carolina' 'U.S. Virgin Islands' 'Montana' 'Kansas' 'Wyoming' 'Mississippi' 'Idaho' 'Alaska' 'New Hampshire' 'Hawaii' 'West Virginia' 'U.S. Pacific Trust Territories and Possessions'] DestCityName ['San Diego, CA' 'Las Vegas, NV' 'Orlando, FL' 'Fort Myers, FL' 'Buffalo, NY' 'Miami, FL' 'Philadelphia, PA' 'Tampa, FL' 'Cleveland, OH' 'Baltimore, MD' 'Chicago, IL' 'Denver, CO' 'West Palm Beach/Palm Beach, FL' 'Trenton, NJ' 'Atlanta, GA' 'Islip, NY' 'Grand Rapids, MI' 'Houston, TX' 'Little Rock, AR' 'Newark, NJ' 'Washington, DC' 'Phoenix, AZ' 'Reno, NV' 'Providence, RI' 'Dallas/Fort Worth, TX' 'Minneapolis, MN' 'Madison, WI' 'Ontario, CA' 'Salt Lake City, UT' 'Sacramento, CA' 'Oakland, CA' 'Raleigh/Durham, NC' 'San Juan, PR' 'Santa Ana, CA' 'San Francisco, CA' 'New York, NY' 'New Orleans, LA' 'Portland, ME' 'Cincinnati, OH' 'Hartford, CT' 'Detroit, MI' 'Boston, MA' 'Charlotte, NC' 'Omaha, NE' 'Fayetteville, AR' 'St. Louis, MO' 'Kansas City, MO' 'Pensacola, FL' 'Syracuse, NY' 'Indianapolis, IN' 'Rochester, NY' 'Jacksonville, FL' 'Burbank, CA' 'Milwaukee, WI' 'Austin, TX' 'Cedar Rapids/Iowa City, IA' 'Colorado Springs, CO' 'Norfolk, VA' 'Harrisburg, PA' 'Nashville, TN' 'Tulsa, OK' 'Seattle, WA' 'Knoxville, TN' 'Newburgh/Poughkeepsie, NY' 'Fresno, CA' 'Portland, OR' 'Albuquerque, NM' 'Sioux Falls, SD' 'El Paso, TX' 'Green Bay, WI' 'Fargo, ND' 'Bloomington/Normal, IL' 'Des Moines, IA' 'Harlingen/San Benito, TX' 'Tucson, AZ' 'San Antonio, TX' 'Huntsville, AL' 'Wilmington, DE' 'Oklahoma City, OK' 'Durango, CO' 'Sarasota/Bradenton, FL' 'Spokane, WA' 'Memphis, TN' 'Columbus, OH' 'Charlotte Amalie, VI' 'Louisville, KY' 'Burlington, VT' 'Albany, NY' 'Pittsburgh, PA' 'Fort Lauderdale, FL' 'Charleston, SC' 'Bismarck/Mandan, ND' 'Myrtle Beach, SC' 'Aguadilla, PR' 'Savannah, GA' 'Grand Junction, CO' 'Branson, MO' 'Kalispell, MT' 'Missoula, MT' 'Los Angeles, CA' 'Bozeman, MT' 'Palm Springs, CA' 'San Jose, CA' 'Billings, MT' 'Wichita, KS' 'Jackson, WY' 'Jackson/Vicksburg, MS' 'Boise, ID' 'Anchorage, AK' 'Greer, SC' 'Mobile, AL' 'Tyler, TX' 'Santa Barbara, CA' 'Portsmouth, NH' 'Lafayette, LA' 'Birmingham, AL' 'Greensboro/High Point, NC' 'Wilmington, NC' 'Richmond, VA' 'Springfield, MO' 'Traverse City, MI' 'Manchester, NH' 'Akron, OH' 'Presque Isle/Houlton, ME' 'Hilton Head, SC' 'Flint, MI' 'Scranton/Wilkes-Barre, PA' 'State College, PA' 'Ithaca/Cortland, NY' 'Asheville, NC' 'Lincoln, NE' 'Nantucket, MA' 'Dayton, OH' 'Allentown/Bethlehem/Easton, PA' 'Roanoke, VA' 'Bangor, ME' 'Peoria, IL' 'Saginaw/Bay City/Midland, MI' 'Moline, IL' 'Lexington, KY' 'Kalamazoo, MI' 'Lansing, MI' 'Appleton, WI' 'Hayden, CO' 'Montrose/Delta, CO' 'Pasco/Kennewick/Richland, WA' 'Bend/Redmond, OR' 'Casper, WY' 'Midland/Odessa, TX' 'Eagle, CO' 'Gunnison, CO' 'Elmira/Corning, NY' 'Rapid City, SD' 'Medford, OR' 'Brownsville, TX' 'South Bend, IN' 'Cody, WY' 'Great Falls, MT' 'Bethel, AK' 'Barrow, AK' 'Deadhorse, AK' 'Ketchikan, AK' 'Juneau, AK' 'Cordova, AK' 'Yakutat, AK' 'Sitka, AK' 'Wrangell, AK' 'Petersburg, AK' 'Honolulu, HI' 'Kodiak, AK' 'Fairbanks, AK' 'Nome, AK' 'Kotzebue, AK' 'Dallas, TX' 'Kahului, HI' 'Kona, HI' 'Lihue, HI' 'Adak Island, AK' 'Everett, WA' 'King Salmon, AK' 'Gustavus, AK' 'Dillingham, AK' 'Santa Rosa, CA' 'Cold Bay, AK' 'Eugene, OR' 'Bellingham, WA' 'Columbia, SC' 'Erie, PA' 'Charlottesville, VA' 'Springfield, IL' 'Charleston/Dunbar, WV' 'White Plains, NY' 'Duluth, MN' 'Panama City, FL' 'Chattanooga, TN' 'Fort Wayne, IN' 'Evansville, IN' 'Columbia, MO' 'Mosinee, WI' 'Valparaiso, FL' 'Waterloo, IA' 'La Crosse, WI' 'Rochester, MN' 'Fayetteville, NC' 'Newport News/Williamsburg, VA' 'Champaign/Urbana, IL' 'Gulfport/Biloxi, MS' 'Lubbock, TX' 'Shreveport, LA' 'Amarillo, TX' 'Corpus Christi, TX' 'Baton Rouge, LA' 'Laredo, TX' 'Texarkana, AR' 'Alexandria, LA' 'Dickinson, ND' 'Mission/McAllen/Edinburg, TX' 'Flagstaff, AZ' 'Lake Charles, LA' 'Santa Fe, NM' 'Idaho Falls, ID' 'Helena, MT' 'Hobbs, NM' 'Killeen, TX' 'College Station/Bryan, TX' 'Monroe, LA' 'Minot, ND' 'Long Beach, CA' 'Hilo, HI' 'Pago Pago, TT' 'Lynchburg, VA' 'Bristol/Johnson City/Kingsport, TN' 'Watertown, NY' 'Salisbury, MD' 'Florence, SC' 'Greenville, NC' 'Augusta, GA' 'Montgomery, AL' 'Tallahassee, FL' 'Ashland, WV' 'Gainesville, FL' 'Jacksonville/Camp Lejeune, NC' 'Toledo, OH' 'Columbus, GA' 'New Bern/Morehead/Beaufort, NC' 'Williamsport, PA' 'Worcester, MA' 'Melbourne, FL' 'Atlantic City, NJ' 'Latrobe, PA' 'Christiansted, VI' 'Niagara Falls, NY' 'Ponce, PR' 'Plattsburgh, NY' 'Yakima, WA' 'Walla Walla, WA' 'Pullman, WA' 'Wenatchee, WA' 'San Luis Obispo, CA' 'Sun Valley/Hailey/Ketchum, ID' 'Redding, CA' 'Monterey, CA' 'Lewiston, ID' 'Mammoth Lakes, CA' 'Key West, FL' 'Yuma, AZ' 'Bakersfield, CA' 'Daytona Beach, FL' 'Fort Smith, AR' 'Lawton/Fort Sill, OK' 'Roswell, NM' 'Joplin, MO' 'Grand Island, NE'] DestStateName ['California' 'Nevada' 'Florida' 'New York' 'Pennsylvania' 'Ohio' 'Maryland' 'Illinois' 'Colorado' 'New Jersey' 'Georgia' 'Michigan' 'Texas' 'Arkansas' 'Virginia' 'Arizona' 'Rhode Island' 'Minnesota' 'Wisconsin' 'Utah' 'North Carolina' 'Puerto Rico' 'Louisiana' 'Maine' 'Kentucky' 'Connecticut' 'Massachusetts' 'Nebraska' 'Missouri' 'Indiana' 'Iowa' 'Tennessee' 'Oklahoma' 'Washington' 'Oregon' 'New Mexico' 'South Dakota' 'North Dakota' 'Alabama' 'Delaware' 'U.S. Virgin Islands' 'Vermont' 'South Carolina' 'Montana' 'Kansas' 'Wyoming' 'Mississippi' 'Idaho' 'Alaska' 'New Hampshire' 'Hawaii' 'West Virginia' 'U.S. Pacific Trust Territories and Possessions'] Origin ['LAS' 'SAN' 'DEN' 'RSW' 'BUF' 'MCO' 'MIA' 'PHL' 'TPA' 'CLE' 'BWI' 'PHX' 'TTN' 'PBI' 'ORD' 'ATL' 'ISP' 'GRR' 'IAH' 'LIT' 'BOS' 'DCA' 'RNO' 'PVD' 'DFW' 'MSP' 'MSN' 'ONT' 'SLC' 'SMF' 'OAK' 'RDU' 'SJU' 'SNA' 'SFO' 'MSY' 'PWM' 'PDX' 'CVG' 'EWR' 'BDL' 'CLT' 'OMA' 'DTW' 'XNA' 'SEA' 'MCI' 'STL' 'PNS' 'SYR' 'IND' 'ROC' 'JAX' 'LGA' 'BUR' 'MKE' 'AUS' 'CID' 'COS' 'ORF' 'MDT' 'BNA' 'TUL' 'TYS' 'SWF' 'FAT' 'ABQ' 'FSD' 'ELP' 'GRB' 'FAR' 'BMI' 'DSM' 'HRL' 'TUS' 'SAT' 'HSV' 'ILG' 'OKC' 'DRO' 'SRQ' 'GEG' 'CMH' 'MEM' 'SDF' 'BTV' 'ALB' 'PIT' 'FLL' 'CHS' 'STT' 'BIS' 'MYR' 'BQN' 'SAV' 'GJT' 'MDW' 'BKG' 'HOU' 'FCA' 'MSO' 'BZN' 'IAD' 'LAX' 'PSP' 'SJC' 'BIL' 'ICT' 'JAC' 'JAN' 'BOI' 'ANC' 'GSP' 'BFM' 'TYR' 'SBA' 'PSM' 'LFT' 'BHM' 'GSO' 'SGF' 'PQI' 'RIC' 'TVC' 'MHT' 'ILM' 'CAK' 'HHH' 'FNT' 'AVP' 'AVL' 'SCE' 'ITH' 'LNK' 'ACK' 'ABE' 'DAY' 'ROA' 'BGR' 'PIA' 'MBS' 'MLI' 'LEX' 'AZO' 'LAN' 'ATW' 'HDN' 'MTJ' 'PSC' 'RDM' 'CPR' 'MAF' 'EGE' 'GUC' 'ELM' 'RAP' 'MFR' 'BRO' 'SBN' 'COD' 'GTF' 'JFK' 'ADQ' 'BET' 'BRW' 'SCC' 'JNU' 'KTN' 'YAK' 'CDV' 'SIT' 'PSG' 'WRG' 'FAI' 'DAL' 'OME' 'OTZ' 'HNL' 'OGG' 'LIH' 'KOA' 'ADK' 'PAE' 'AKN' 'GST' 'DLG' 'STS' 'CDB' 'EUG' 'BLI' 'HPN' 'CRW' 'SPI' 'DLH' 'CAE' 'ERI' 'CHO' 'CHA' 'ECP' 'FWA' 'COU' 'CWA' 'EVV' 'VPS' 'ALO' 'LSE' 'RST' 'FAY' 'PHF' 'CMI' 'BTR' 'SHV' 'FLG' 'DIK' 'TXK' 'GPT' 'LRD' 'CRP' 'MFE' 'LBB' 'MOB' 'SAF' 'LCH' 'AMA' 'AEX' 'IDA' 'HLN' 'HOB' 'CLL' 'GRK' 'MLU' 'MOT' 'LGB' 'ITO' 'PPG' 'MGM' 'PGV' 'TRI' 'LYH' 'FLO' 'HTS' 'AGS' 'ART' 'TLH' 'SBY' 'GNV' 'OAJ' 'TOL' 'CSG' 'EWN' 'IPT' 'ORH' 'MLB' 'ACY' 'LBE' 'STX' 'IAG' 'PSE' 'PBG' 'YKM' 'PUW' 'ALW' 'EAT' 'SBP' 'SUN' 'RDD' 'MRY' 'LWS' 'MMH' 'EYW' 'YUM' 'BFL' 'DAB' 'FSM' 'LAW' 'ROW' 'JLN' 'GRI'] Dest ['SAN' 'LAS' 'MCO' 'RSW' 'BUF' 'MIA' 'PHL' 'TPA' 'CLE' 'BWI' 'ORD' 'DEN' 'PBI' 'TTN' 'ATL' 'ISP' 'GRR' 'IAH' 'LIT' 'EWR' 'DCA' 'PHX' 'RNO' 'PVD' 'DFW' 'MSP' 'MSN' 'ONT' 'SLC' 'SMF' 'OAK' 'RDU' 'SJU' 'SNA' 'SFO' 'LGA' 'MSY' 'PWM' 'CVG' 'BDL' 'DTW' 'BOS' 'CLT' 'OMA' 'XNA' 'STL' 'MCI' 'PNS' 'SYR' 'IND' 'ROC' 'JAX' 'BUR' 'MKE' 'AUS' 'CID' 'COS' 'ORF' 'MDT' 'BNA' 'TUL' 'SEA' 'TYS' 'SWF' 'FAT' 'PDX' 'ABQ' 'FSD' 'ELP' 'GRB' 'FAR' 'BMI' 'DSM' 'HRL' 'TUS' 'SAT' 'HSV' 'ILG' 'OKC' 'DRO' 'SRQ' 'GEG' 'MEM' 'CMH' 'STT' 'SDF' 'BTV' 'ALB' 'PIT' 'FLL' 'CHS' 'BIS' 'MYR' 'BQN' 'SAV' 'GJT' 'MDW' 'BKG' 'HOU' 'FCA' 'MSO' 'LAX' 'BZN' 'IAD' 'PSP' 'SJC' 'BIL' 'ICT' 'JAC' 'JAN' 'BOI' 'ANC' 'GSP' 'BFM' 'TYR' 'SBA' 'PSM' 'LFT' 'BHM' 'GSO' 'ILM' 'RIC' 'SGF' 'TVC' 'MHT' 'CAK' 'PQI' 'HHH' 'FNT' 'AVP' 'SCE' 'ITH' 'AVL' 'LNK' 'ACK' 'DAY' 'ABE' 'ROA' 'BGR' 'PIA' 'MBS' 'MLI' 'LEX' 'AZO' 'LAN' 'ATW' 'HDN' 'MTJ' 'PSC' 'RDM' 'CPR' 'MAF' 'EGE' 'GUC' 'ELM' 'RAP' 'MFR' 'BRO' 'SBN' 'COD' 'GTF' 'JFK' 'BET' 'BRW' 'SCC' 'KTN' 'JNU' 'CDV' 'YAK' 'SIT' 'WRG' 'PSG' 'HNL' 'ADQ' 'FAI' 'OME' 'OTZ' 'DAL' 'OGG' 'KOA' 'LIH' 'ADK' 'PAE' 'AKN' 'GST' 'DLG' 'STS' 'CDB' 'EUG' 'BLI' 'CAE' 'ERI' 'CHO' 'SPI' 'CRW' 'HPN' 'DLH' 'ECP' 'CHA' 'FWA' 'EVV' 'COU' 'CWA' 'VPS' 'ALO' 'LSE' 'RST' 'FAY' 'PHF' 'CMI' 'GPT' 'LBB' 'SHV' 'AMA' 'CRP' 'BTR' 'LRD' 'TXK' 'MOB' 'AEX' 'DIK' 'MFE' 'FLG' 'LCH' 'SAF' 'IDA' 'HLN' 'HOB' 'GRK' 'CLL' 'MLU' 'MOT' 'LGB' 'ITO' 'PPG' 'LYH' 'TRI' 'ART' 'SBY' 'FLO' 'PGV' 'AGS' 'MGM' 'TLH' 'HTS' 'GNV' 'OAJ' 'TOL' 'CSG' 'EWN' 'IPT' 'ORH' 'MLB' 'ACY' 'LBE' 'STX' 'IAG' 'PSE' 'PBG' 'YKM' 'ALW' 'PUW' 'EAT' 'SBP' 'SUN' 'RDD' 'MRY' 'LWS' 'MMH' 'EYW' 'YUM' 'BFL' 'DAB' 'FSM' 'LAW' 'ROW' 'JLN' 'GRI'] Tail_Number ['N206FR' 'N207FR' 'N211FR' ... 'N926LR' 'N243LR' 'N407SW'] Description ['Frontier Airlines Inc.' 'GoJet Airlines LLC d/b/a United Express' 'Alaska Airlines Inc.' 'Air Wisconsin Airlines Corp' 'CommuteAir LLC dba CommuteAir' 'Hawaiian Airlines Inc.' 'Piedmont Airlines' 'Spirit Air Lines' 'Horizon Air' 'Mesa Airlines Inc.']
#Finding missing values in the data
missing_values=[features for features in small_airlines.columns if small_airlines[features].isnull().sum()>0]
for feature in missing_values:
{print(feature,np.round(small_airlines[feature].isnull().mean(),4), '% of missing values')}
CarrierDelay 0.8086 % of missing values WeatherDelay 0.8086 % of missing values NASDelay 0.8086 % of missing values SecurityDelay 0.8086 % of missing values LateAircraftDelay 0.8086 % of missing values
#We can see that the causes of delay fields have many missing values which are null values and we are imputing these columns with 0 for our analysis
small_airlines['CarrierDelay'].fillna(0, inplace=True)
small_airlines['WeatherDelay'].fillna(0, inplace=True)
small_airlines['NASDelay'].fillna(0, inplace=True)
small_airlines['SecurityDelay'].fillna(0, inplace=True)
small_airlines['LateAircraftDelay'].fillna(0, inplace=True)
#To Checking distinct values for feature and decide if that column needs to be included in the analysis
for column in small_airlines.columns:
print(column,small_airlines[column].nunique())
year 6 month 12 flightdate 1915 DayOfWeek 7 DayofMonth 31 Operating_Airline 10 OriginCityName 273 OriginStateName 53 DestCityName 273 DestStateName 53 DepTime 1440 DepDelay 1540 DepDelayMinutes 1457 DepDel15 2 ArrTime 1440 ArrDelay 1580 ArrDelayMinutes 1468 ArrDel15 2 CRSDepTime 1359 CRSArrTime 1432 Distance 694 CarrierDelay 1315 WeatherDelay 984 NASDelay 905 SecurityDelay 208 LateAircraftDelay 1173 TaxiIn 204 TaxiOut 189 WheelsOff 1440 WheelsOn 1440 Origin 278 Dest 278 Tail_Number 1313 Description 10
#Identifying the number of categorical and numerical variables in dataset
numerical_variables = list(small_airlines.select_dtypes(include=['int64','float64']).drop('ArrDel15', axis=1).columns)
categorical_variables = list(small_airlines.select_dtypes(include=['object']).columns)
numerical_variables#24 numerical features
['year', 'month', 'DayOfWeek', 'DayofMonth', 'DepTime', 'DepDelay', 'DepDelayMinutes', 'DepDel15', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes', 'CRSDepTime', 'CRSArrTime', 'Distance', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'TaxiIn', 'TaxiOut', 'WheelsOff', 'WheelsOn']
categorical_variables#10 categorical features
['flightdate', 'Operating_Airline', 'OriginCityName', 'OriginStateName', 'DestCityName', 'DestStateName', 'Origin', 'Dest', 'Tail_Number', 'Description']
#Finding the variable distribution for the data
plt.figure(figsize=(12,80), facecolor='white')
plotnumber =1
for cat in categorical_variables:
ax = plt.subplot(14,1,plotnumber)
sns.countplot(y=cat,data=small_airlines)
#plt.xlabel(categorical_variables)
plotnumber+=1
plt.show()
#Check target label split over categorical features and find the count for the dataset
for cat in categorical_variables:
print(small_airlines.groupby(['ArrDel15',cat]).size())
ArrDel15 flightdate
0.0 2018-01-01 2268
2018-01-02 2418
2018-01-03 2350
2018-01-04 2131
2018-01-05 2304
...
1.0 2023-04-26 713
2023-04-27 950
2023-04-28 911
2023-04-29 783
2023-04-30 1030
Length: 3830, dtype: int64
ArrDel15 Operating_Airline
0.0 AS 910950
C5 222734
F9 492849
G7 255105
HA 300887
NK 785013
PT 400531
QX 455119
YV 677316
ZW 331279
1.0 AS 192204
C5 74673
F9 169911
G7 63412
HA 50364
NK 200458
PT 78146
QX 75818
YV 163770
ZW 75002
dtype: int64
ArrDel15 OriginCityName
0.0 Adak Island, AK 325
Aguadilla, PR 2141
Akron, OH 6417
Albany, NY 15844
Albuquerque, NM 12700
...
1.0 Worcester, MA 151
Wrangell, AK 490
Yakima, WA 720
Yakutat, AK 513
Yuma, AZ 226
Length: 546, dtype: int64
ArrDel15 OriginStateName
0.0 Alabama 30962
Alaska 145991
Arizona 103746
Arkansas 13741
California 413171
...
1.0 Virginia 58157
Washington 98349
West Virginia 1306
Wisconsin 10414
Wyoming 784
Length: 106, dtype: int64
ArrDel15 DestCityName
0.0 Adak Island, AK 398
Aguadilla, PR 2060
Akron, OH 6453
Albany, NY 15048
Albuquerque, NM 12384
...
1.0 Worcester, MA 103
Wrangell, AK 649
Yakima, WA 644
Yakutat, AK 553
Yuma, AZ 254
Length: 546, dtype: int64
ArrDel15 DestStateName
0.0 Alabama 30386
Alaska 142753
Arizona 104473
Arkansas 13525
California 407030
...
1.0 Virginia 54907
Washington 97485
West Virginia 1235
Wisconsin 10972
Wyoming 726
Length: 106, dtype: int64
ArrDel15 Origin
0.0 ABE 3980
ABQ 12700
ACK 930
ACY 12732
ADK 325
...
1.0 WRG 490
XNA 1381
YAK 513
YKM 720
YUM 226
Length: 556, dtype: int64
ArrDel15 Dest
0.0 ABE 3913
ABQ 12384
ACK 1032
ACY 12273
ADK 398
...
1.0 WRG 649
XNA 1530
YAK 553
YKM 644
YUM 254
Length: 556, dtype: int64
ArrDel15 Tail_Number
0.0 N10156 1104
N101NK 20
N11106 814
N11107 897
N11113 933
...
1.0 N979NK 28
N980NK 45
N999FR 6
NZ52AW 18
NZ55AW 18
Length: 2625, dtype: int64
ArrDel15 Description
0.0 Air Wisconsin Airlines Corp 331279
Alaska Airlines Inc. 910950
CommuteAir LLC dba CommuteAir 222734
Frontier Airlines Inc. 492849
GoJet Airlines LLC d/b/a United Express 255105
Hawaiian Airlines Inc. 300887
Horizon Air 455119
Mesa Airlines Inc. 677316
Piedmont Airlines 400531
Spirit Air Lines 785013
1.0 Air Wisconsin Airlines Corp 75002
Alaska Airlines Inc. 192204
CommuteAir LLC dba CommuteAir 74673
Frontier Airlines Inc. 169911
GoJet Airlines LLC d/b/a United Express 63412
Hawaiian Airlines Inc. 50364
Horizon Air 75818
Mesa Airlines Inc. 163770
Piedmont Airlines 78146
Spirit Air Lines 200458
dtype: int64
#Exploring and analyzing the numerical variables
# list of numerical variables
numerical_features = [feature for feature in small_airlines.columns if ((small_airlines[feature].dtypes != 'O') & (feature not in ['outcome']))]
print('Number of numerical variables: ', len(numerical_features))
# visualise the numerical variables
small_airlines[numerical_features].head()
Number of numerical variables: 24
| year | month | DayOfWeek | DayofMonth | DepTime | DepDelay | DepDelayMinutes | DepDel15 | ArrTime | ArrDelay | ... | Distance | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | TaxiIn | TaxiOut | WheelsOff | WheelsOn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 2 | 2 | 1 | 1752 | -6.0 | 0.0 | 0.0 | 1902 | -11.0 | ... | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 15.0 | 1807 | 1857 |
| 1 | 2022 | 2 | 2 | 1 | 1951 | -7.0 | 0.0 | 0.0 | 2108 | -13.0 | ... | 48.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 18.0 | 2009 | 2057 |
| 2 | 2022 | 2 | 2 | 1 | 17 | 90.0 | 90.0 | 1.0 | 719 | 71.0 | ... | 214.0 | 71.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 16.0 | 33 | 707 |
| 3 | 2022 | 2 | 2 | 1 | 39 | 19.0 | 19.0 | 1.0 | 610 | 11.0 | ... | 185.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 16.0 | 55 | 600 |
| 4 | 2022 | 2 | 2 | 1 | 718 | -12.0 | 0.0 | 0.0 | 1025 | 0.0 | ... | 156.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 24.0 | 742 | 1018 |
5 rows × 24 columns
#Finding discrete numerical variables
discrete_feature=[feature for feature in numerical_features if len(small_airlines[feature].unique())<25]
print("Discrete Variables Count: {}".format(len(discrete_feature)))
Discrete Variables Count: 5
#Identifying continuous numerical variables
continuous_features=[feature for feature in numerical_features if feature not in discrete_feature+['ArrDel15']]
print("Continuous feature Count {}".format(len(continuous_features)))
Continuous feature Count 19
#Distribution of the continuous numerical variables
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for continuous_feature in continuous_features:
ax = plt.subplot(12,3,plotnumber)
sns.distplot(small_airlines[continuous_feature])
plt.xlabel(continuous_feature)
plotnumber+=1
plt.show()
#boxplot to show target distribution with respect to numerical features
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for feature in continuous_features:
ax = plt.subplot(12,3,plotnumber)
sns.boxplot(x="ArrDel15", y= small_airlines[feature], data=small_airlines)
plt.xlabel(feature)
plotnumber+=1
plt.show()
#Identifying outliers on numerical features
plt.figure(figsize=(20,60), facecolor='white')
plotnumber =1
for numerical_feature in numerical_features:
ax = plt.subplot(12,3,plotnumber)
sns.boxplot(small_airlines[numerical_feature])
plt.xlabel(numerical_feature)
plotnumber+=1
plt.show()
df_numericals = small_airlines[numerical_variables]
df_numericals.head()
| year | month | DayOfWeek | DayofMonth | DepTime | DepDelay | DepDelayMinutes | DepDel15 | ArrTime | ArrDelay | ... | Distance | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | TaxiIn | TaxiOut | WheelsOff | WheelsOn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 2 | 2 | 1 | 1752 | -6.0 | 0.0 | 0.0 | 1902 | -11.0 | ... | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 15.0 | 1807 | 1857 |
| 1 | 2022 | 2 | 2 | 1 | 1951 | -7.0 | 0.0 | 0.0 | 2108 | -13.0 | ... | 48.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 18.0 | 2009 | 2057 |
| 2 | 2022 | 2 | 2 | 1 | 17 | 90.0 | 90.0 | 1.0 | 719 | 71.0 | ... | 214.0 | 71.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 16.0 | 33 | 707 |
| 3 | 2022 | 2 | 2 | 1 | 39 | 19.0 | 19.0 | 1.0 | 610 | 11.0 | ... | 185.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 16.0 | 55 | 600 |
| 4 | 2022 | 2 | 2 | 1 | 718 | -12.0 | 0.0 | 0.0 | 1025 | 0.0 | ... | 156.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 24.0 | 742 | 1018 |
5 rows × 23 columns
#Finding the correlation between numerical features
## Checking for correlation
cor_mat=df_numericals.corr()
fig = plt.figure(figsize=(15,7))
sns.heatmap(cor_mat,annot=True)
<Axes: >
#Checking if the dataset is balanced with respect to the target variable
sns.countplot(x='ArrDel15',data=small_airlines)
plt.show()
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.6)
axis = sns.countplot(x=small_airlines['Description'], data=small_airlines, order=small_airlines['Description'].value_counts().iloc[0:18].index, orient="v")
axis.set_xticklabels(axis.get_xticklabels(), rotation=90, ha='right')
plt.title('TOTAL NUMBER OF FLIGHTS BY AIRLINES', fontsize=24)
plt.xlabel('OPERATING AIRLINE', fontsize=18)
plt.ylabel('NUMBER OF FLIGHTS', fontsize=18)
plt.tight_layout()
plt.show()
This plot tells us that the Top 5 Small Airlines that have most domestic flights are: Alaska Airlines Inc. Spirit Airlines Mesa Airlines Inc. Frontier Airlines Inc. Horizon Air
small_airlines.Description.value_counts()#Double checking if the counts match
Alaska Airlines Inc. 1103154 Spirit Air Lines 985471 Mesa Airlines Inc. 841086 Frontier Airlines Inc. 662760 Horizon Air 530937 Piedmont Airlines 478677 Air Wisconsin Airlines Corp 406281 Hawaiian Airlines Inc. 351251 GoJet Airlines LLC d/b/a United Express 318517 CommuteAir LLC dba CommuteAir 297407 Name: Description, dtype: int64
Now we will calculate the Total Number of flights that are delayed by Airline
plt.figure(figsize=(20, 10))
small_airlines.groupby('Description').ArrDel15.sum().sort_values(ascending=False).plot.bar()
plt.title('TOTAL DELAYED FLIGHTS BY AIRLINE', fontsize=20)
plt.xlabel('AIRLINE NAME', fontsize=16)
plt.ylabel('NUMBER OF FLIGHTS', fontsize=16)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)
plt.show()
As per the Total Number of delays per airline we can see that the Top 5 Airlines are as below:
Here Piedmont Airlines and Horizon Air have swapped places where Horizon Air is the only airline which was in the Top 5 flights list but ended up in the 6th position in terms of delay
small_airlines.ArrDel15.value_counts(normalize=True)
0.0 0.808593 1.0 0.191407 Name: ArrDel15, dtype: float64
Here we are trying to identify the percentage of airlines that were delayed as per our target variable (ArrDel15) and we can see that around 19.14% flights are delayed and around 80.85% of the flights have been on time.
# Now we calculate the percentage of flights delayed
df1_PFD = small_airlines[['Description', 'ArrDel15']]
df1_PFD.head()
| Description | ArrDel15 | |
|---|---|---|
| 0 | Frontier Airlines Inc. | 0.0 |
| 1 | Frontier Airlines Inc. | 0.0 |
| 2 | Frontier Airlines Inc. | 1.0 |
| 3 | Frontier Airlines Inc. | 0.0 |
| 4 | Frontier Airlines Inc. | 0.0 |
df1_PFD.shape
(5975541, 2)
# Non-normalized value_counts
df1_PFD.ArrDel15.value_counts()
0.0 4831783 1.0 1143758 Name: ArrDel15, dtype: int64
t1 = df1_PFD.groupby(['Description']).sum().reset_index()
t2 = small_airlines.Description.value_counts().rename_axis('Description').reset_index(name='TOTAL_FLIGHTS')
df_fp = pd.merge(t1, t2, on='Description')
df_fp['PERCENTAGE_DELAYED'] = round((df_fp['ArrDel15'] * 100) / df_fp['TOTAL_FLIGHTS'], 2)
df_fp.rename({'ArrDel15': 'DELAYED_FLIGHTS'}, axis=1, inplace=True)
df_fp.head(20)
| Description | DELAYED_FLIGHTS | TOTAL_FLIGHTS | PERCENTAGE_DELAYED | |
|---|---|---|---|---|
| 0 | Air Wisconsin Airlines Corp | 75002.0 | 406281 | 18.46 |
| 1 | Alaska Airlines Inc. | 192204.0 | 1103154 | 17.42 |
| 2 | CommuteAir LLC dba CommuteAir | 74673.0 | 297407 | 25.11 |
| 3 | Frontier Airlines Inc. | 169911.0 | 662760 | 25.64 |
| 4 | GoJet Airlines LLC d/b/a United Express | 63412.0 | 318517 | 19.91 |
| 5 | Hawaiian Airlines Inc. | 50364.0 | 351251 | 14.34 |
| 6 | Horizon Air | 75818.0 | 530937 | 14.28 |
| 7 | Mesa Airlines Inc. | 163770.0 | 841086 | 19.47 |
| 8 | Piedmont Airlines | 78146.0 | 478677 | 16.33 |
| 9 | Spirit Air Lines | 200458.0 | 985471 | 20.34 |
SUM = df_fp.DELAYED_FLIGHTS.sum()#The total flights count match with the above count
print('Total number of delayed flights:', SUM)
Total number of delayed flights: 1143758.0
SUM1= df_fp.PERCENTAGE_DELAYED.sum()# Here we can see that the total percentage is going above 100% the reason could be that most airlines have more percentage delay than the overall percentage delay i.e. greater than 19%
print('Total percentage of delayed flights:', SUM1)
Total percentage of delayed flights: 191.29999999999998
# Plot with the 19.14% as threshold value
plt.figure(figsize=(20, 10))
df_fp.groupby('Description').PERCENTAGE_DELAYED.sum().sort_values(ascending=False).plot.bar(fontsize=14)
plt.hlines(y=19.14, xmin=-1, xmax=18, colors='r', linestyles='solid', label='cutoff')
plt.title('PERCENTAGE OF DELAYED FLIGHTS BY AIRLINE', fontsize=20)
plt.xlabel('AIRLINE', fontsize=16)
plt.ylabel('PERCENTAGE', fontsize=16)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
plt.show()
--The top 5 Airlines by percentage has now changed we can see that Commute Air and GoJet Airlines were not part of our top 5 list earlier but in terms of delay percentage they are in the top 5. --Frontier airlines have arrived around 74.36% on time which is the lowest and the highest here is Horizon Air which is 85.72%, this is interesting as Horizon Air is 6th in the number of flights list and still has managed to maintain lower arrival delay times. --Frontier Airlines was 3rd in the number of flights but has the highest number of flight delays amongst them all. --Another point to note is for Commute Air which was 8th in the list of number of flights but is ranking 2nd in terms of delay
plt.figure(figsize=(20, 10))
small_airlines.groupby('Description').ArrDelay.sum().sort_values(ascending=False).plot.bar()
plt.title('TOTAL DELAYED MINUTES BY AIRLINE', fontsize=18)
plt.xlabel('AIRLINE', fontsize=14)
plt.ylabel('TOTAL DELAYED MINUTES', fontsize=14)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)
plt.show()
Here Alaska Airlines which was among the top 5 Airlines stands out as it has lower Arrival delay minutes as compared to the other airlines while Mesa Airlines tops the list
small_airlines.head(2)
| year | month | DayOfWeek | flightdate | Operating_Airline | OriginCityName | Dest | Origin | DestCityName | DestState | ... | Tail_Number | DOT_ID_Operating_Airline | Flight_Number_Operating_Airline | OriginStateName | DestStateName | ArrTimeBlk | CancellationCode | FirstDepTime | TotalAddGTime | LongestAddGTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 2 | 2.0 | 2022-02-01 | F9 | Las Vegas, NV | SAN | LAS | San Diego, CA | CA | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2022 | 2 | 2.0 | 2022-02-01 | F9 | San Diego, CA | LAS | SAN | Las Vegas, NV | NV | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 rows × 45 columns
#Calculating the Average Arrival Delay
test_3 = small_airlines[['Description', 'ArrDelay']]
test_3 = test_3.groupby(['Description']).mean().reset_index()
test_3.rename({'ArrDelay': 'AVG_DELAY'}, axis=1, inplace=True)
test_3 = test_3.sort_values(['AVG_DELAY']).reset_index(drop=True)
test_3.head(18)
| Description | AVG_DELAY | |
|---|---|---|
| 0 | Alaska Airlines Inc. | 0.363510 |
| 1 | Horizon Air | 1.024617 |
| 2 | Piedmont Airlines | 1.706491 |
| 3 | Hawaiian Airlines Inc. | 2.507848 |
| 4 | Air Wisconsin Airlines Corp | 5.381155 |
| 5 | Spirit Air Lines | 6.429750 |
| 6 | GoJet Airlines LLC d/b/a United Express | 7.030058 |
| 7 | Mesa Airlines Inc. | 8.140483 |
| 8 | Frontier Airlines Inc. | 9.971267 |
| 9 | CommuteAir LLC dba CommuteAir | 14.626418 |
We can see that Alaska Airlines is lower in terms of Average Arrival delay but, what catches the attention is Commute Air which was 4th in the list by Total Delayed minutes is ranking higher in the list of Average Arrival delay
#Plot shows the averages that we calculate above
fig = plt.figure(1, figsize=(15,9))
sns.set(font_scale=1.6)
ax = sns.barplot(x='AVG_DELAY', y='Description', data=test_3)
ax.set_title('AVERAGE ARRIVAL DELAY BY AIRLINE (mins)', fontsize=18)
ax.set_xlabel('AVERAGE DELAY BY AIRLINE (min)', fontsize=14)
plt.show()
#Top 20 destinations where these airlines travel the most
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.6)
axis = sns.countplot(x=small_airlines['DestCityName'], data=small_airlines, order=small_airlines['DestCityName'].value_counts().iloc[0:20].index)
axis.set_xticklabels(axis.get_xticklabels(), rotation=90, ha='right')
plt.title('Top 20 Destination Cities', fontsize=20)
plt.xlabel('20 Top Destination Cities (DestCityName)', fontsize=14)
plt.ylabel('Total Number of Flights', fontsize=14)
plt.tight_layout()
plt.show()
#Top 5 Airports that have the most Arrival Delay
plt.figure(figsize=(20, 10))
small_airlines.groupby('Dest').ArrDelay.sum().sort_values(ascending=False).plot.bar()
axis = sns.countplot(x=small_airlines['Dest'], data=small_airlines, order=small_airlines['Dest'].value_counts().iloc[0:5].index)
plt.title('TOTAL DELAYED MINUTES BY AIRLINE', fontsize=18)
plt.xlabel('DESTINATION AIRPORT', fontsize=14)
plt.ylabel('TOTAL DELAYED MINUTES', fontsize=14)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)
plt.show()
We can see that the top 5 Airports where delays have been observed with respect to Arrival are:
#Worse and Best months to travel based on flights delay
plt.figure(figsize=(20, 10))
small_airlines.groupby('month').ArrDelay.sum().sort_values(ascending=False).plot.bar()
plt.title('Number of Delayed Flights per Month', fontsize=18)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Number of Flights', fontsize=14)
plt.show()
We can see that the best months to travel or most travel is during the summer months that is June to August. Followed by February and December which are holiday seasons The worst month is September which could be due to start of the Fall season when the weather is changing and that could cause delays
#Day of Week vs Arrival Delay to check is there a specific day of the week when Arrival delays are high
plt.figure(figsize=(20, 10))
barplot =small_airlines.groupby('DayOfWeek').ArrDel15.sum().sort_values(ascending=False).plot.bar()
plt.title('Number of Delayed Flights per day of the week', fontsize=18)
plt.xlabel('Day of the week', fontsize=14)
plt.ylabel('Number of Flights', fontsize=14)
plt.show()
We can see that Friday's have the highest number of flight delays while Saturday's have the lowest number of flight delays, which could be due to more number of people travelling after work on Friday's for weekend trips. The 2nd highest is on Monday which could be because its the start of the week and people need to rejoin their work
#Day of Month vs Arrival Delay to check is there a specific day of the week when Arrival delays are high
plt.figure(figsize=(20, 10))
barplot =small_airlines.groupby('DayofMonth').ArrDel15.sum().sort_values(ascending=False).plot.bar()
plt.title('Number of Delayed Flights per day of the month', fontsize=18)
plt.xlabel('Day of the month', fontsize=14)
plt.ylabel('Number of Flights', fontsize=14)
plt.show()
#Analyze Departure vs Arrival Delays
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['hatch.linewidth'] = 1.8
fig = plt.figure(1, figsize=(15,9))
ax = sns.barplot(x="DepDelay", y="Description", data=small_airlines, color="green", ci=None)
ax = sns.barplot(x="ArrDelay", y="Description", data=small_airlines, color="r", hatch = '///', alpha = 0.0, ci=None)
ax.yaxis.label.set_visible(False)
plt.title('Departure vs Arrival Delay')
plt.xlabel('Mean delay (min)', fontsize=14, labelpad=10);
fig
Based on the above analysis it is clear that Arrival delays are lower for all the airlines while the departure delays are higher for all of them. The highest departure delay is for Commute Air, followed by Frontier Airlines and GoJet Airlines which shows that airlines try their best to overcome the departure delays by trying to reduce their causes of Arrival delays and reaching the destination airports on time. But this is only based on the factors that they can control.
#Most popular Destination Airport with Average Arrival Delays
t4 = small_airlines[['Dest', 'ArrDelay']]
t4 = t4.groupby(['Dest']).mean().reset_index()
t4.head(10)
| Dest | ArrDelay | |
|---|---|---|
| 0 | ABE | 2.193191 |
| 1 | ABQ | 6.713205 |
| 2 | ACK | 10.526746 |
| 3 | ACY | 5.982557 |
| 4 | ADK | -0.005871 |
| 5 | ADQ | -0.671023 |
| 6 | AEX | 14.659603 |
| 7 | AGS | 3.930588 |
| 8 | AKN | 2.007199 |
| 9 | ALB | 9.801680 |
t5 = small_airlines.Dest.value_counts().rename_axis('Dest').reset_index(name='FLIGHTS').iloc[0:30]
t5.head(30)
| Dest | FLIGHTS | |
|---|---|---|
| 0 | SEA | 495093 |
| 1 | ORD | 279372 |
| 2 | IAH | 242419 |
| 3 | DFW | 199013 |
| 4 | IAD | 187601 |
| 5 | PHL | 186390 |
| 6 | LAS | 184502 |
| 7 | MCO | 174800 |
| 8 | PDX | 168150 |
| 9 | DEN | 166584 |
| 10 | HNL | 158547 |
| 11 | LAX | 133642 |
| 12 | EWR | 123151 |
| 13 | CLT | 118138 |
| 14 | PHX | 115202 |
| 15 | FLL | 108796 |
| 16 | DTW | 100531 |
| 17 | SFO | 94607 |
| 18 | ATL | 84328 |
| 19 | OGG | 80517 |
| 20 | ANC | 75107 |
| 21 | SAN | 66275 |
| 22 | TPA | 54332 |
| 23 | BWI | 46093 |
| 24 | KOA | 44889 |
| 25 | CLE | 43445 |
| 26 | LIH | 42771 |
| 27 | BOS | 41619 |
| 28 | GEG | 41559 |
| 29 | SJC | 36654 |
t4_5 = pd.merge(t5, t4, on='Dest')
t4_5.head()
| Dest | FLIGHTS | ArrDelay | |
|---|---|---|---|
| 0 | SEA | 495093 | 0.949551 |
| 1 | ORD | 279372 | 6.321399 |
| 2 | IAH | 242419 | 4.903065 |
| 3 | DFW | 199013 | 10.597730 |
| 4 | IAD | 187601 | 6.041220 |
t4_5 = t4_5.set_index('Dest')
fig = plt.figure(figsize=(15,7)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as a
width = .3
t4_5.FLIGHTS.plot(kind='bar',color='grey',ax=ax, width=width, position=0)
t4_5.ArrDelay.plot(kind='bar',color='blue', ax=ax2, width=width, position=1)
plt.hlines(y=4.90, xmin=-1, xmax=30, colors='b', linestyles='dashed', label='cutoff')
#ax.grid(None, axis=1)
ax2.grid(None)
ax.set_title('MOST POPULAR AIRPORTS vs AVERAGE ARRIVAL DELAY', fontsize=18)
ax.set_ylabel('NUMBER OF FLIGHTS', fontsize=16)
ax2.set_ylabel('AVERAGE ARRIVAL DELAY (mins)')
ax.set_xlabel('DESTINATION AIRPORTS', fontsize=16)
plt.show()
We conclude the Exploratory Data Analysis here the next part we start working with the model and decide which columns to be used which will be covered in another python notebook